In [1]:
from IPython.display import IFrame, display, HTML

import pandas as pd
import numpy as np

from jinja2 import Template

from bokeh.models import (
    ColumnDataSource, Plot, Circle, Range1d, 
    LinearAxis, TapTool, HoverTool, Text,
    SingleIntervalTicker,
)
from bokeh.models.actions import Callback
from bokeh.models.widgets import Slider
from bokeh.palettes import Spectral6
from bokeh.plotting import vplot, hplot
from bokeh.resources import Resources
from bokeh.embed import components

Get the Data


In [2]:
# Links via http://www.gapminder.org/data/ 
"""
population_url = "http://spreadsheets.google.com/pub?key=phAwcNAVuyj0XOoBL_n5tAQ&output=xls"
fertility_url = "http://spreadsheets.google.com/pub?key=phAwcNAVuyj0TAlJeCEzcGQ&output=xls"
life_expectancy_url = "http://spreadsheets.google.com/pub?key=tiAiXcrneZrUnnJ9dBU-PAw&output=xls"

def get_data(url):
    # Get the data from the url and return only 1962 - 2013
    df = pd.read_excel(url, index_col=0)
    df = df.unstack().unstack()
    df = df[(df.index >= 1964) & (df.index <= 2013)]
    df = df.unstack().unstack()    
    return df

fertility_df = get_data(fertility_url)
life_expectancy_df = get_data(life_expectancy_url)
population_df = get_data(population_url)

fertility_df.to_hdf('fertility_df.hdf', 'df')
life_expectancy_df.to_hdf('life_expectancy_df.hdf', 'df')
population_df.to_hdf('population_df.hdf', 'df')
"""
fertility_df = pd.read_hdf('fertility_df.hdf', 'df')
life_expectancy_df = pd.read_hdf('life_expectancy_df.hdf', 'df')
population_df = pd.read_hdf('population_df.hdf', 'df')

In [3]:
# have common countries across all data
fertility_df = fertility_df.drop(fertility_df.index.difference(life_expectancy_df.index))
population_df = population_df.drop(population_df.index.difference(life_expectancy_df.index))

# get a size value based on population, but don't let it get too small
population_df_size = np.sqrt(population_df/np.pi)/200
min_size = 3
population_df_size = population_df_size.where(population_df_size >= min_size).fillna(min_size)

In [12]:
fertility_df.to_csv('fertility.csv')
population_df.to_csv('population.csv')
life_expectancy_df.to_csv('life_expectancy.csv')

Get the regions and color them


In [4]:
regions_url = "https://docs.google.com/spreadsheets/d/1OxmGUNWeADbPJkQxVPupSOK5MbAECdqThnvyPrwG5Os/pub?gid=1&output=xls"
regions_df = pd.read_excel(regions_url, index_col=0)
regions_df = regions_df.drop(regions_df.index.difference(life_expectancy_df.index))
#regions_df.Group = regions_df.Group.astype('category')
#cats = list(regions_df.Group.cat.categories)
#def get_color(r):
#    index = cats.index(r.Group)
    return Spectral6[cats.index(r.Group)]
regions_df['region_color'] = regions_df.apply(get_color, axis=1)

Build the plot

We build it using html to use the html slider widget instead of the ipython widget. This makes the plot more re-useable and means the slider will work on NBViewer.


In [5]:
# Set up the data. 
#
# We make a dictionary of sources that can then be passed to the callback so they are ready for JS object to use.
#
# Dictionary_of_sources is:
# {
#   1962: '_1962',
#   1963: '_1963',
#   ....
# }
# We turn this into a string  and replace '_1962' with _1962. So the end result is js_source_array:
# '{1962: _1962, 1963: _1963, ....}'
#
# When this is passed into the callback and then accessed at runtime,
# the _1962, _1963 are replaced with the actual source objects that are passed in as args.

sources = {}

years = list(fertility_df.columns)

region_color = regions_df['region_color']
region_color.name = 'region_color'


for year in years:
    fertility = fertility_df[year]
    fertility.name = 'fertility'
    life = life_expectancy_df[year]
    life.name = 'life' 
    population = population_df_size[year]
    population.name = 'population' 
    new_df = pd.concat([fertility, life, population, region_color], axis=1)
    sources['_' + str(year)] = ColumnDataSource(new_df)

dictionary_of_sources = dict(zip([x for x in years], ['_%s' % x for x in years]))
js_source_array = str(dictionary_of_sources).replace("'", "")


# Set up the plot
xdr = Range1d(1, 9)
ydr = Range1d(20, 100)
plot = Plot(
    x_range=xdr,
    y_range=ydr,
    title="",
    plot_width=800,
    plot_height=400,
    outline_line_color=None,
    toolbar_location=None,    
)
AXIS_FORMATS = dict(
    minor_tick_in=None,
    minor_tick_out=None,
    major_tick_in=None,
    major_label_text_font_size="10pt",
    major_label_text_font_style="normal",
    axis_label_text_font_size="10pt",

    axis_line_color='#AAAAAA',
    major_tick_line_color='#AAAAAA',
    major_label_text_color='#666666',

    major_tick_line_cap="round",
    axis_line_cap="round",
    axis_line_width=1,
    major_tick_line_width=1,
)

xaxis = LinearAxis(SingleIntervalTicker(interval=1), axis_label="Live births per woman", **AXIS_FORMATS)
yaxis = LinearAxis(SingleIntervalTicker(interval=20), axis_label="Average life expectancy (years)", **AXIS_FORMATS)   
plot.add_layout(xaxis, 'below')
plot.add_layout(yaxis, 'left')

# Add the year in background (add before circle)
text_source = ColumnDataSource({'year': ['%s' % years[0]]})
text = Text(x=2, y=35, text='year', text_font_size='150pt', text_color='#EEEEEE')
plot.add_glyph(text_source, text)

# Add the circle
renderer_source = sources['_%s' % years[0]]
circle_glyph = Circle(
    x='fertility', y='life', size='population',
    fill_color='region_color', fill_alpha=0.8, 
    line_color='#7c7e71', line_width=0.5, line_alpha=0.5)
circle_renderer = plot.add_glyph(renderer_source, circle_glyph)

# Add the hover (only against the circle and not other plot elements)
tooltips = "@index"
plot.add_tools(HoverTool(tooltips=tooltips, renderers=[circle_renderer]))


# Add the slider
code = """
    var year = slider.get('value'),
        sources = %s,
        new_source_data = sources[year].get('data');
    renderer_source.set('data', new_source_data);
    renderer_source.trigger('change');
    text_source.set('data', {'year': [String(year)]});
    text_source.trigger('change');
""" % js_source_array

callback = Callback(args=sources, code=code)
slider = Slider(start=years[0], end=years[-1], value=1, step=1, title="Year", callback=callback)
callback.args["slider"] = slider
callback.args["renderer_source"] = renderer_source
callback.args["text_source"] = text_source

# Add the legend
text_x = 7
text_y = 95
text_properties = dict(x=text_x, text_font_size='10pt', text_color='#666666')
circle_properties = dict(size=10, line_color=None, fill_alpha=0.8)
for i, region in enumerate(cats):
    plot.add_glyph(Text(y=text_y, text=[region], **text_properties))
    plot.add_glyph(Circle(x=text_x - 0.1, y=text_y + 2, fill_color=Spectral6[i], **circle_properties))
    text_y = text_y - 5
    
# Stick the plot and the slider together
layout = vplot(plot, hplot(slider))

Embed in your own template


In [6]:
with open('my_template.html', 'r') as f:
    template = Template(f.read())
    
script, div = components(layout)
html = template.render(
    title="Bokeh - Gapminder demo",
    plot_script=script,
    plot_div=div,
)

display(HTML(html))


Bokeh - Gapminder demo

Bokeh - Gapminder example

In Hans Rosling's famous TED Talk he showed why our perceptions of a "first" world and a "developing" world are wrong and that the world is now a spectrum of developing countries and many advances have been made since our early notions of development from the 60s.

To Do

  • Responsive
  • Make page layout decent

Would be nice

  • Play button
  • Click on legend and subset are selected

In [7]:
with open('gapminder.html', 'w') as f:
    f.write(html)

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: